library(qtlcharts)
library(CountClust)
library(parallel)
library(cellcycleR)
library(data.table)
library(binhf)
library(vioplot)
library(limma)
library(readxl)
library(Humanzee)
library(mygene)
library(knitr)

Background and objectives

After fitting sinusoidal model to Gilad2015 iPSC NA19239 (link1, we fitted PCA to the same data and compare fitting results.

Conclusions

We fit the model under the scenarios of 1) all annotated cell-cycle genes, 2) high SNR from cell-cycle genes, 3) best set from the high-SNR genes. In all scenarios, PCA performs poorly…

Data prepration

Import final batch-corrected iPSC data

molecules_final <- read.table("../data/gilad-2015/molecules-final.txt",
                              header = TRUE,
                              stringsAsFactors = FALSE)
anno_filter <- read.table("../data/gilad-2015/annotation-filter.txt",
                          header = TRUE,
                          stringsAsFactors = FALSE)
dim(molecules_final)
## [1] 12192   560
table(anno_filter$individual)
## 
## NA19098 NA19101 NA19239 
##     141     199     220

Extract only data of one individual

molecules_final_subset <- molecules_final[ , anno_filter$individual == "NA19239"]
dim(molecules_final_subset)
## [1] 12192   220
molecules_final_subset <- molecules_final_subset

Import cell-cycle genes

cellcycle_genes <- read.table("../data/gilad-2015/cellcyclegenes.txt",
                               header = TRUE,
                               sep = "\t",
                               stringsAsFactors = FALSE)
colnames(cellcycle_genes) <- c("G1.S","S","G2","G2.M","M.G1")

Extract only cell-cycle genes

which_cell_cycle <- which (rownames(molecules_final_subset) %in% unlist(cellcycle_genes))
cycle_data <- t(molecules_final_subset[which_cell_cycle, ])
dim(cycle_data)
## [1] 220 536

Standardize expression levels into z-scores for for gene. This step is for easy visualization of expression levels in plots.

cycle_data_normed <- apply(cycle_data, 2, 
                            function(x)  return (x-mean(x))/sd(x))

Fitting on cell-cycle genes

if (file.exists("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28.rda")) {
    load("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28.rda")
} else {
results <- sin_cell_ordering_class(cycle_data_normed, 
                                   celltime_levels = 300,
                                   num_iter = 300)
save(results, 
     file ="../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28.rda")
}
str(results)
## List of 6
##  $ cell_times      : num [1:220] 5.758 2.753 4.287 0.357 3.257 ...
##  $ amp             : num [1:536] 0.0689 0.256 0.1392 0.0721 0.3354 ...
##  $ phi             : num [1:536] 1.312 0.746 1.426 1.245 1.873 ...
##  $ sigma           : num [1:536] 0.434 0.828 0.647 0.546 1.456 ...
##  $ loglik          : num -148111
##  $ signal_intensity: num [1:220, 1:300] -698 -781 -687 -667 -859 ...

Post-processing of cell-phase order.

cell_order_full <- cell_ordering_full(results$signal_intensity, dim(molecules_final)[2])
str(cell_order_full)
##  num [1:220(1d)] 5.625 2.732 4.413 0.226 2.972 ...

Model estimates

We look at the plots of the amplitudes, phases and the non signal variation of the genes.

amp_genes <- results$amp
sd_genes <- results$sigma
phi_genes <- results$phi

par(mfrow=c(2,2))
plot(density(phi_genes), col="red", 
     main="Density plot of the phases")
plot(density(amp_genes[-which.max(amp_genes)]), col="red", 
     main="Density plot of the amplitudes")
plot(density(sd_genes[-which.max(sd_genes)]), col="red", 
     main="Density plot of the non-signal sd")

ESS <- amp_genes^2
RSS <- sd_genes^2
SNR <- ESS/RSS
plot(SNR, col="red", pch=20, lwd=1)

PCA vs. Re-ordered expression pattern

After re-ordering.

new_cell_order <- binhf::shift(order(cell_order_full), 130, dir = "right")
iplotCurves(t(cycle_data_normed[new_cell_order, ]) )
## Set screen size to height=700 x width=1000

pca_cycle <- prcomp(cycle_data_normed[new_cell_order, ], center=TRUE, scale. = TRUE)
par(mfrow= c(2,2))
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)
plot(pca_cycle$x[,1], pca_cycle$x[,3], pch=20, lwd=0.01)
plot(pca_cycle$x[,2], pca_cycle$x[,3], pch=20, lwd=0.01)

Re-fitting on high SNR genes

We extract the high SNR genes as they seem to have sinusoidal patterns and repeat the procedure again.

snr_high_indices <- which(SNR > .1)
cycle_data_normed_high_snr <- cycle_data_normed[ ,snr_high_indices]
dim(cycle_data_normed_high_snr)
## [1] 220  60

Modeing fitting…

if (file.exists("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr.rda")) {
    load("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr.rda")
} else {
results_high_snr <- sin_cell_ordering_class(cycle_data_normed_high_snr, 
                                        celltime_levels = 300, num_iter=200)
save(results_high_snr, 
     file = "../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr.rda")
}

Post-processing

cell_order_full <- cell_ordering_full(results_high_snr$signal_intensity,
                                      dim(cycle_data_normed_high_snr)[2])

Model estimates

We plot the same features as above and check for the robustness. We needed to shift the cell order so as to compare with previous plot on all genes as the method is non-identifiable upto a rotation.

amp_genes <- results_high_snr$amp;
sd_genes <- results_high_snr$sigma;
phi_genes <- results_high_snr$phi;

par(mfrow = c(2,2))
plot(density(phi_genes), col="red", main="Density plot of the phases")
plot(density(amp_genes[-which.max(amp_genes)]), col="red", main="Density plot of the amplitudes")
plot(density(sd_genes[-which.max(sd_genes)]), col="red", main="Density plot of the non-signal sd")

ESS <- amp_genes^2; RSS <- sd_genes^2
SNR <- ESS/RSS;
plot(SNR, col="red", pch=20, lwd=1)

PCA vs. Re-ordered expression patterns

new_cell_order <- binhf::shift(order(cell_order_full), 130, dir = "right")
iplotCurves(t(cycle_data_normed_high_snr[new_cell_order, ]) ) 

pca_cycle <- prcomp(cycle_data_normed_high_snr[new_cell_order, ], 
                    center=TRUE, scale. = TRUE);
par(mfrow= c(2,2))
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)
plot(pca_cycle$x[,1], pca_cycle$x[,3], pch=20, lwd=0.01)
plot(pca_cycle$x[,2], pca_cycle$x[,3], pch=20, lwd=0.01)

Re-fitting one more time…

snr_high_indices <- which(SNR > .5)
cycle_data_normed_high_snr_high <- cycle_data_normed_high_snr[ ,snr_high_indices]
dim(cycle_data_normed_high_snr_high)
## [1] 220  18

Modeing fitting…

if (file.exists("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr-high.rda")) {
    load("../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr-high.rda")
} else {
results_high_snr_high <- sin_cell_ordering_class(cycle_data_normed_high_snr_high, 
                                        celltime_levels = 300, num_iter=200)
save(results_high_snr_high, 
     file = "../rdas/yoav_cycleR_cellcycle_genes-2016-01-28/cellorder-ipsc-2016-01-28-high-snr0high.rda")
}
## Final loglikelihood, iter 200 : -546.854341498

Post-processing

cell_order_full <- cell_ordering_full(results_high_snr_high$signal_intensity,
                                      dim(cycle_data_normed_high_snr)[2])

Model estimates

We plot the same features as above and check for the robustness. We needed to shift the cell order so as to compare with previous plot on all genes as the method is non-identifiable upto a rotation.

amp_genes <- results_high_snr_high$amp;
sd_genes <- results_high_snr_high$sigma;
phi_genes <- results_high_snr_high$phi;

par(mfrow = c(2,2))
plot(density(phi_genes), col="red", main="Density plot of the phases")
plot(density(amp_genes[-which.max(amp_genes)]), col="red", main="Density plot of the amplitudes")
plot(density(sd_genes[-which.max(sd_genes)]), col="red", main="Density plot of the non-signal sd")

ESS <- amp_genes^2; RSS <- sd_genes^2
SNR <- ESS/RSS;
plot(SNR, col="red", pch=20, lwd=1)

PCA vs. Re-ordered expression patterns

new_cell_order <- binhf::shift(order(cell_order_full), 130, dir = "right")
iplotCurves(t(cycle_data_normed_high_snr_high[new_cell_order, ]) ) 

pca_cycle <- prcomp(cycle_data_normed_high_snr_high[new_cell_order, ], 
                    center=TRUE, scale. = TRUE);
par(mfrow= c(2,2))
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)
plot(pca_cycle$x[,1], pca_cycle$x[,3], pch=20, lwd=0.01)
plot(pca_cycle$x[,2], pca_cycle$x[,3], pch=20, lwd=0.01)

Session information

sessionInfo()
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] knitr_1.11             mygene_1.2.3           GenomicFeatures_1.20.6
##  [4] AnnotationDbi_1.30.1   Biobase_2.28.0         GenomicRanges_1.20.8  
##  [7] GenomeInfoDb_1.4.3     IRanges_2.2.9          S4Vectors_0.6.6       
## [10] BiocGenerics_0.14.0    Humanzee_0.1.0         readxl_0.1.0          
## [13] limma_3.24.15          vioplot_0.2            sm_2.2-5.4            
## [16] binhf_1.0-1            adlift_1.3-2           EbayesThresh_1.3.2    
## [19] wavethresh_4.6.6       MASS_7.3-45            data.table_1.9.6      
## [22] cellcycleR_0.1.1       CountClust_0.1.0       qtlcharts_0.5-25      
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.0.0              jsonlite_0.9.19        
##  [3] splines_3.2.1           gsubfn_0.6-6           
##  [5] Formula_1.2-1           latticeExtra_0.6-26    
##  [7] Rsamtools_1.20.5        yaml_2.1.13            
##  [9] RSQLite_1.0.0           lattice_0.20-33        
## [11] chron_2.3-47            digest_0.6.8           
## [13] RColorBrewer_1.1-2      XVector_0.8.0          
## [15] colorspace_1.2-6        htmltools_0.3          
## [17] plyr_1.8.3              XML_3.98-1.3           
## [19] biomaRt_2.24.1          zlibbioc_1.14.0        
## [21] scales_0.3.0            BiocParallel_1.2.22    
## [23] sqldf_0.4-10            ggplot2_2.0.0          
## [25] nnet_7.3-11             proto_0.3-10           
## [27] survival_2.38-3         magrittr_1.5           
## [29] evaluate_0.8            foreign_0.8-66         
## [31] tools_3.2.1             formatR_1.2.1          
## [33] stringr_1.0.0           munsell_0.4.2          
## [35] cluster_2.0.3           lambda.r_1.1.7         
## [37] Biostrings_2.36.4       futile.logger_1.4.1    
## [39] grid_3.2.1              RCurl_1.95-4.7         
## [41] htmlwidgets_0.5         bitops_1.0-6           
## [43] rmarkdown_0.9.2         gtable_0.1.2           
## [45] DBI_0.3.1               R6_2.1.1               
## [47] GenomicAlignments_1.4.2 gridExtra_2.0.0        
## [49] rtracklayer_1.28.10     Hmisc_3.17-1           
## [51] futile.options_1.0.0    stringi_1.0-1          
## [53] Rcpp_0.12.2             rpart_4.1-10           
## [55] acepack_1.3-3.3